Overview

This document contains plots and statistical analyses for Homm et al. “Can Untargeted RNA-Seq of Urine Samples Diagnose UTIs in Children?”.

Loading packages

library(DESeq2)
library(ggplot2)
library(biomaRt)
library(EnhancedVolcano)
library(factoextra)
library(org.Hs.eg.db)
library(ensembldb)
library(tximport)
library(AnnotationHub)
library(clusterProfiler)
library(AnnotationDbi)
library(ggvenn)
library(ComplexHeatmap)
library(reshape2)
library(dplyr)
library(tidyr)
library(readxl)
library(pROC)
library(circlize)
library(ggpubr)
library(effsize)
library(mclust)
library(classInt)
library(patchwork)
library(grid)
library(eulerr)
library(devtools)
library(xCell)
library(circlize)
library(plotly)
library(showtext)

# List of your packages
pkgs <- c(
  "DESeq2", "ggplot2", "biomaRt", "EnhancedVolcano", "factoextra", "org.Hs.eg.db",
  "ensembldb", "tximport", "AnnotationHub", "clusterProfiler", "AnnotationDbi",
  "ggvenn", "ComplexHeatmap", "reshape2", "dplyr", "tidyr", "readxl", "pROC",
  "circlize", "ggpubr", "effsize", "mclust", "classInt", "patchwork", "grid",
  "eulerr", "devtools", "xCell", "plotly"
)

# function to get version
get_version <- function(pkg) {
  if (requireNamespace(pkg, quietly = TRUE)) {
    as.character(packageVersion(pkg))
  } else {
    "Not Installed"
  }
}

# create named vector of versions
pkg_versions <- sapply(pkgs, get_version)

# print nicely
versions_df <- data.frame(Package = names(pkg_versions), Version = pkg_versions)
print(versions_df, row.names = FALSE)
##          Package Version
##           DESeq2  1.48.1
##          ggplot2   3.5.2
##          biomaRt  2.64.0
##  EnhancedVolcano  1.26.0
##       factoextra   1.0.7
##     org.Hs.eg.db  3.21.0
##        ensembldb  2.32.0
##         tximport  1.36.0
##    AnnotationHub  3.16.0
##  clusterProfiler  4.16.0
##    AnnotationDbi  1.70.0
##           ggvenn  0.1.10
##   ComplexHeatmap  2.24.0
##         reshape2   1.4.4
##            dplyr   1.1.4
##            tidyr   1.3.1
##           readxl   1.4.5
##             pROC  1.18.5
##         circlize  0.4.16
##           ggpubr   0.6.0
##          effsize   0.8.1
##           mclust   6.1.1
##         classInt  0.4.11
##        patchwork   1.3.0
##             grid   4.5.0
##           eulerr   7.0.2
##         devtools   2.4.5
##            xCell   1.1.0
##           plotly  4.10.4

Load in necessary objects

#' Contains RPM data for all species, along with corresponding metadata
load("../data/reads.meta.counts.239.RData")

#' Contains txi object with all Salmon-mapped human gene expression levels
load("../data/txi.data.239.RData")

#' Contains just RPM data for all species
load("../data/rpmdata.239.RData")

# Temporary: test out for bracken instead
load("../data/bracken_rpm_496.RData")
good.patients <- reads.meta.counts.239$ptid
rpmdata.239 <- bracken_rpm_496[as.character(good.patients) , ] 
print(dim(rpmdata.239))
## [1]  239 5956
# Incorporate this better, but these patients had dirty tubes (hence should be removed)
patients.to.remove <- c("15887", "15963", "15985", "15991", "15994", "16003", 
                        "16050", "15860", "15900", "15906", "15927", "16018", 
                        "16089")

1. Salmon-based unsupervised host-response analysis

  1. Filter out patients with <= 5000 human genes, and genes with mean count < 1, then run DESeq2 with no condition (unsupervised)
# ID patients that express over 5000 human genes
human.gene.counts <- apply(txi.data.239$counts, 2, function(x) sum(x != 0))
human.gene.over.thres <- subset(human.gene.counts, human.gene.counts > 5000)
human.gene.over.thres <- human.gene.over.thres[na.omit(names(human.gene.over.thres))]

# Remove the dirty patient tubes
human.gene.over.thres <- human.gene.over.thres[!names(human.gene.over.thres) %in% patients.to.remove]

# Subset txi.data to only include these patients
txi.data.239$abundance <- subset(txi.data.239$abundance, select = c(names(human.gene.over.thres)))
txi.data.239$counts <- subset(txi.data.239$counts, select = c(names(human.gene.over.thres)))
txi.data.239$length <- subset(txi.data.239$length, select = c(names(human.gene.over.thres)))

# Calculate the mean counts of each gene
gene_means <- rowMeans(txi.data.239$counts)

# Filter out genes with mean counts less than 1
txi.data.239$counts <- txi.data.239$counts[gene_means > 1, ]
txi.data.239$abundance <- txi.data.239$abundance[gene_means > 1, ]
txi.data.239$length <- txi.data.239$length[gene_means > 1, ]

print(dim(txi.data.239$abundance))
## [1] 41898   207
col_data <- data.frame(row.names = colnames(txi.data.239$counts), condition = factor(rep(1, ncol(txi.data.239$counts))))
col_data <- cbind(col_data, reads.meta.counts.239[human.gene.over.thres , ])
dds.unsupervised <- DESeqDataSetFromTximport(txi.data.239, colData = col_data, design = ~1)
dds.unsupervised <- DESeq(dds.unsupervised)

# Initialize df's to be used in downstream analysis
reads.meta.counts <- reads.meta.counts.239[names(human.gene.over.thres) , ]
rpmdata <- rpmdata.239[names(human.gene.over.thres) , ]
txi.data <- txi.data.239
  1. Proportion of human and non-human reads across the patients
# Obtain reads data
reads <- reads.meta.counts$total.reads
human.reads <- reads.meta.counts$human.reads
data <- data.frame(
  patient = 1:length(human.reads),
  non_human_reads = reads - human.reads,
  human_reads = human.reads
)

# Compute proportion of human reads
data$human_read_proportion <- data$human_reads / reads

# Order patients by human read proportion
data <- data %>% arrange(human_read_proportion)

# Convert to long format for stacked bar plot
long_data <- data %>%
  pivot_longer(cols = c(human_reads, non_human_reads), names_to = "Read Type", values_to = "value")

# Ensure "human_reads" is the first level so it appears at the bottom
long_data$`Read Type` <- factor(long_data$`Read Type`, levels = c("human_reads", "non_human_reads"))

# Create the stacked bar plot
stacked_bar <- ggplot(long_data, aes(x = factor(patient, levels = data$patient), y = value, fill = `Read Type`)) +
  geom_col(position = "stack") +
  scale_fill_manual(values = c("human_reads" = "blue", "non_human_reads" = "red")) +
  labs(x = "214 Patients", y = "Number of Reads", fill = "Read Type") +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    panel.grid = element_blank(),
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 15)
  )


# Create a thin heatmap at the bottom
heatmap <- ggplot(data, aes(x = factor(patient, levels = data$patient), y = 1, fill = human_read_proportion)) +
  geom_tile() +
  scale_fill_gradient(low = "grey", high = "darkgreen", 
                      limits = c(0, 1),
                      breaks = c(0, 0.25, 0.5, 0.75, 1)) +
  labs(fill = "Human Read Proportion") +
  theme_void() +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 12), 
    legend.key.height = unit(0.5, "cm"),  
    legend.key.width = unit(1.5, "cm"),
    aspect.ratio = 0.05,
    legend.title.align = -1, 
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  )

# Combine both plots using patchwork with adjusted heights
final_plot <- stacked_bar / heatmap + plot_layout(heights = c(4, 1))

print(final_plot)

  1. Stabilize the data and run PCA
normalized_counts <- counts(dds.unsupervised, normalized = TRUE)
vsd <- vst(dds.unsupervised, blind = TRUE)
pca_data <- plotPCA(vsd, returnData = TRUE)
pca_df <- data.frame(
  PC1 = pca_data$PC1,
  PC2 = pca_data$PC2,
  Sample = rownames(pca_data)
  )
  1. Same PCA, but plot in 3D
# mimic plotPCA internals to get more PCs
ntop <- 500
rv <- rowVars(assay(vsd))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
pca <- prcomp(t(assay(vsd)[select, ]))

# make data frame for plotting
pca_df_3d <- data.frame(
  PC1 = pca$x[,1],
  PC2 = pca$x[,2],
  PC3 = pca$x[,3]
)

# interactive 3D plot
plot_ly(
  data = pca_df_3d,
  x = ~PC1, y = ~PC2, z = ~PC3,
  type = "scatter3d",
  mode = "markers"
)
# Extract the variance of each
summary(pca)$importance[2, 1]
## [1] 0.70994
summary(pca)$importance[2, 2]
## [1] 0.03683
summary(pca)$importance[2, 3]
## [1] 0.02862
  1. k-means clustering
set.seed(1)
# Run k-means clustering, k = 2
kmeans_result_pca <- kmeans(pca_df[ , c("PC1", "PC2")], centers = 2, nstart = 25)

# Add cluster assignments to your data frame
pca_df$cluster <- as.factor(kmeans_result_pca$cluster)

cluster_colors <- c("red", "blue")

# Create the plot comparing PC1 and PC2
k_means_plot_pca <- ggplot(pca_df, aes(x = PC1, y = PC2, color = factor(cluster))) +
  geom_point(size = 3, alpha = 0.8) +
  stat_ellipse(aes(fill = factor(cluster)),   geom = "polygon",
  alpha = 0.3,
  color = "black",    # outline to make visible
  level = 0.99,
  show.legend = TRUE) +
  labs(
    title = "PCA Plot with K-means Clusters (k = 2)",
    x = "PC1",
    y = "PC2",
    color = "Cluster",
    fill = "Cluster"
  ) +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
    axis.title = element_text(face = "bold"),
    legend.position = "right",
    legend.text = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "black", fill = NA, size = 1.5)
  ) +
  scale_color_manual(values = cluster_colors) +
  scale_fill_manual(values = cluster_colors)

# Display the plot
k_means_plot_pca

  1. Same PCA + k-means, but in 3D this time
# Perform k-means clustering using 3 PCs
kmeans_result_pca_3d <- kmeans(pca_df_3d[, c("PC1", "PC2", "PC3")], centers = 2, nstart = 25)

# Add cluster assignments to your data frame
pca_df_3d$cluster <- as.factor(kmeans_result_pca_3d$cluster)

# Set color palette
cluster_colours <- c("1" = "red", "2" = "blue")

# 3D PCA plot with k-means clusters
k_means_plot_3d <- plot_ly(
  data = pca_df_3d,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~cluster,
  colors = cluster_colours,
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 5, opacity = 0.8)
) %>%
  layout(
    title = list(text = "3D PCA Plot with K-means Clusters (k = 2)", x = 0.5),
    scene = list(
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2"),
      zaxis = list(title = "PC3")
    )
  )

# Display the plot
k_means_plot_3d
# library(reticulate)
# 
# save_image(
#   k_means_plot_3d,
#   file   = "3D_test.jpeg",
#   width  = 3600,     # px
#   height = 2400,     # px
#   scale  = 1         # no additional scaling
# )
  1. DESeq2 comparing both groups
# Re-name the clusters to 0/1
patients.over.thres <- names(human.gene.over.thres)
k_groups <- kmeans_result_pca$cluster
k_groups <- ifelse(k_groups == 2, 0, 1)
names(k_groups) <- patients.over.thres

# Create DESeq metadata
k_groups_metadata <- data.frame(group = k_groups, 
                                group_word = sapply(k_groups, function(x) if (x == 1) {return("uti")} else {return("no_uti")}))

# Run DESeq on cluster 1 vs. cluster 2, using the original txi.data
dds.kmeans <- DESeqDataSetFromTximport(txi.data, colData = k_groups_metadata, design = ~group_word)

# Brief moment to run vst again
vst.for.heatmap <- varianceStabilizingTransformation(dds.kmeans, blind = TRUE)

# Run DESeq2
dds.kmeans <- DESeq(dds.kmeans)

# Isolate the volcano info
res.human <- results(dds.kmeans, contrast = c("group_word", "uti", "no_uti"))

# Map ensembl ids to their actual gene names
res.df.human <- as.data.frame(res.human)
res.df.human$symbol <- mapIds(org.Hs.eg.db, keys = rownames(res.df.human), keytype = "ENSEMBL", column = "SYMBOL")

genes_to_label <- c("LIMK2", "CSF3R", "BCL6", "NFKB1", "SELL", "JAK3")

# Create the volcano
volc <- EnhancedVolcano(res.df.human, 
                        x = "log2FoldChange", 
                        y = "padj", 
                        lab = res.df.human$symbol, 
                        selectLab = genes_to_label,  
                        drawConnectors = TRUE, 
                        pointSize = 3.0, 
                        title = "DESeq2 with K-means Groups", 
                        subtitle = "",
                        FCcutoff = 1, 
                        pCutoff = 0.05)

volc

  1. Perform biological process enrichment on these up/down-regulated genes
enrichment <- function(res.df, padj_thres, log2fc_thres, up.or.down, save = FALSE) {

  if (up.or.down == "up") {
    sig.genes <- rownames(res.df[res.df$log2FoldChange > log2fc_thres & res.df$padj < padj_thres, ])
    sig.genes <- sig.genes[!is.na(sig.genes) & !grepl("^NA", sig.genes)]
    title = "Upregulated Pathways"
  } else {
    sig.genes <- rownames(res.df[res.df$log2FoldChange < -log2fc_thres & res.df$padj < padj_thres, ])
    sig.genes <- sig.genes[!is.na(sig.genes) & !grepl("^NA", sig.genes)]
    title = "Downregulated Pathways"
  }
  
  # Run GO analysis
  GO_results <- enrichGO(gene = sig.genes, OrgDb = "org.Hs.eg.db", keyType = "ENSEMBL", ont = "BP")
  GO_results_df <- as.data.frame(GO_results)
  
  # Print p-values for debugging
  print("Original p.adjust values:")
  print(head(GO_results_df$p.adjust))
  
  # Create plot using the enrichResult object (GO_results)
  fit.up <- barplot(GO_results, showCategory = 10) +
    xlab("Number of Genes") + 
    ggtitle(title) + 
    theme(
      plot.title = element_text(size = 15, face = "bold", hjust = 0.5), 
      axis.text.y = element_text(size = 12)
    ) +
    scale_fill_gradient(
      low = "#e73512", 
      high = "#ee794d", 
      name = "Adjusted p-value",
      trans = "log10",  # Log transform for small p-values
      labels = scales::scientific_format()  # Force scientific notation
    )
  
  if (save) {
    ggsave(
      filename = paste0(up.or.down, ".pdf"), 
      plot = fit.up, 
      device = "pdf", 
      width = 12,
      height = 10
    )
  }
  
  print(paste("Number of significant genes:", length(sig.genes)))
  return(fit.up)
}

# Run enrichment function on up/down-regulated genes
upreg.plot <- enrichment(res.df.human, 0.05, 1, "up", save = FALSE)
## [1] "Original p.adjust values:"
## [1] 1.386622e-43 1.508425e-41 4.558586e-40 5.455905e-40 9.053207e-38
## [6] 2.740763e-35
## [1] "Number of significant genes: 3384"
downreg.plot <- enrichment(res.df.human, 0.05, 1, "down", save = FALSE)
## [1] "Original p.adjust values:"
## [1] 1.100490e-30 8.773306e-28 1.126552e-26 1.069060e-23 3.835198e-23
## [6] 9.015425e-22
## [1] "Number of significant genes: 6193"
print(upreg.plot)

print(downreg.plot)

2. Kraken2 and Bracken-based taxonomic classification

  1. Compute pathogen abundance scores
# Remove human reads
df_no_human <- rpmdata[, !colnames(rpmdata) %in% "Homo sapiens"]
total_rpm <- rowSums(df_no_human)

# Normalize dataframe back to RPM
df_renormalized <- as.data.frame(sweep(df_no_human, 1, total_rpm, "/") * 1e6)
df_renormalized[is.na(df_renormalized)] <- 0
df_renormalized <- df_renormalized[patients.over.thres , ]

# Seven common UTI-associated pathogens
seven.pathogens <- c("Escherichia coli", "Klebsiella pneumoniae", "Enterococcus faecalis", "Proteus mirabilis", 
                     "Pseudomonas aeruginosa", "Staphylococcus saprophyticus", "Streptococcus agalactiae") 

# Subset to only include reads for these seven pathogens
df_subset <- df_renormalized[ , colnames(df_renormalized) %in% seven.pathogens]
rpm_sums <- rowSums(df_subset, na.rm = TRUE)
rpm_sums <- rpm_sums[patients.over.thres]

# Logs the final summation of normalized pathogen abundance
rpm_sums <- sapply(rpm_sums, function(x) max(0, log10(x)))

# # Look at k-means clustering of rpm_sums
# km <- kmeans(10^rpm_sums, centers = 2)
# centers <- sort(km$centers)
# threshold <- mean(centers)
# cat("K-means threshold =", threshold, "\n")

# # Run jenks classification algorithm to find optimal threshold
# pathogen_abundance = rpm_sums
# breaks <- classIntervals(pathogen_abundance, n=2, style="jenks")
# threshold <- max(breaks$brks[-length(breaks$brks)])
# print(paste0("Threshold = ", threshold))

# Look at kernel density and find derivatives
d <- density(rpm_sums)
x_vals <- d$x
y_vals <- d$y
minima_idx <- which(diff(sign(diff(y_vals))) == 2) + 1
minima_x <- x_vals[minima_idx]
minima_y <- y_vals[minima_idx]
print(minima_x)
## [1] 1.067306 4.755726
plot(d, main = "Density of rpm_sums with local minima")
points(minima_x, minima_y, col = "red", pch = 19)
abline(v = minima_x, col = "red", lty = 2)

threshold <- minima_x[2]

# View dist of these abundance scores
hist(rpm_sums, breaks = 12, main = title("Logged RPM, Human Reads Excluded"))
abline(v = threshold, col = "red", lty = 2, lwd = 2)

# View pathogen abundance scores across random y-axis

set.seed(1234)

# Create a data frame with random y-values
abundance.df <- data.frame(
  x = rpm_sums,
  y = runif(length(rpm_sums), min = 0, max = 1), 
  colour = ifelse(rpm_sums > threshold, "Positive", "Negative")
)

rpm.jitter <- ggplot(abundance.df, aes(x = x, y = y, color = colour)) +
  geom_jitter(height = 0, width = 0.1, alpha = 0.6, size = 2.5) +
  geom_vline(xintercept = threshold, color = "black", linetype = "dashed", linewidth = 1) +
  labs(
    title = "RPM Sum Distribution",
    x = expression(log[10] ~ "(Pathogen Abundance Score, RPM)"),
    y = "Random Y-Value",
    color = "Cluster"
  ) +
  scale_color_manual(values = c("Positive" = "red", "Negative" = "blue")) +
  theme_classic(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"), 
    axis.title = element_text(),  
    legend.position = "right",
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1) 
    
  )

print(rpm.jitter)

  1. Bubble plot observing pathogen abundance across the patients
heatmap.df <- t(df_renormalized[patients.over.thres, seven.pathogens])
heatmap.df <- apply(heatmap.df, c(1, 2), function(x) max(log10(x), 0))
rownames(heatmap.df) <- sapply(rownames(heatmap.df), function(x) gsub(" ", "\n", x))

# Convert the heatmap.df matrix to a long-format data frame
bubble_data <- melt(heatmap.df)
colnames(bubble_data) <- c("Row", "Column", "Value")

# Get the E coli row values
e_coli_values <- heatmap.df["Escherichia\ncoli", ]

# Reorder the columns based on the E coli values (descending order)
bubble_data$Column <- factor(bubble_data$Column, levels = names(sort(e_coli_values, decreasing = TRUE)))

bubble_plot <- ggplot(bubble_data, aes(x = Column, y = Row, size = Value, fill = Value)) +
  geom_point(shape = 21, color = "black") +
  scale_size_continuous(range = c(1, 10)) +
  scale_fill_gradient(low = "white", high = "red") +  
  labs(
    x = "214 Patients",  
    y = "Pathogen",
    size = "log10(RPM)",
    fill = "log10(RPM)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(), 
    axis.text.y = element_text(size = 12, face = "italic"),
    axis.title.x = element_text(size = 12), 
    axis.title.y = element_text(size = 12), 
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1) 
  )

print(bubble_plot)

  1. ROC and AUC: agreement between Kraken-derived RPM scores and clinical diagnostics
log.reg.df <- data.frame(df_subset)

# Ecoli

log.reg.df$ecoli <- ifelse(reads.meta.counts$ecoli == 1, 1, 0)  
log.reg.df$ecoli[is.na(log.reg.df$ecoli)] <- 0  

ecoli <- glm(ecoli ~ Escherichia.coli, data = log.reg.df, family = binomial)  
ecoli_probs <- predict(ecoli, type = "response")
auc_ecoli <- roc(log.reg.df$ecoli, ecoli_probs)
print(auc_ecoli)
## 
## Call:
## roc.default(response = log.reg.df$ecoli, predictor = ecoli_probs)
## 
## Data: ecoli_probs in 104 controls (log.reg.df$ecoli 0) < 103 cases (log.reg.df$ecoli 1).
## Area under the curve: 0.9969
plot(auc_ecoli, col = "blue", main = "ROC Curve for E. coli")

# Extract optimal information
optimal_threshold <- coords(auc_ecoli, "best", ret = "threshold")[1]
predicted_classes <- ifelse(ecoli_probs >= optimal_threshold, 1, 0)
predicted_classes <- sapply(X = ecoli_probs, function(x) ifelse(x >= optimal_threshold, 1, 0))
print(predicted_classes)
## 10005 10010 10016 10021 10022 10043 10048 10050 10065 10108 10117 10136 10145 
##     0     0     1     1     1     0     1     0     1     0     0     1     0 
## 10160 10166 10168 10199 10201 10206 10220 10254 10257 10263 10286 10292 10308 
##     1     1     1     1     0     1     0     1     1     0     1     0     1 
## 10315 10318 10346 10354 10365 10369 10377 10380 10383 10385 10389 10393 10395 
##     1     1     1     0     0     1     0     0     1     1     0     0     1 
## 10397 10422 10423 10428 10430 10443 10448 10454 10458 10459 10466 10468 10470 
##     0     1     1     0     0     1     0     1     0     1     1     1     1 
## 10487 10490 10494 10506 10511 10514 10519 10530 10532 10533 10539 10542 10550 
##     1     1     1     1     0     1     1     1     0     0     1     0     0 
## 10554 10555 10581 10584 10596 10608 10622 10625 10644 10650 10656 10659 10660 
##     0     0     1     0     1     0     1     0     0     0     1     1     0 
## 10667 10674 10675 10678 10685 10687 10703 10717 10720 10759 10763 10803 10811 
##     1     1     1     0     0     0     0     0     1     1     0     0     0 
## 10815 10817 10821 10831 10833 10836 10852 10867 10869 10871 10877 10883 10891 
##     0     0     1     0     0     1     0     0     0     1     0     0     1 
## 10892 10896 10897 10900 10901 10913 10927 10940 10967 10969 10973 10992 11003 
##     1     0     0     0     0     0     0     0     1     1     0     0     0 
## 11007 11009 11010 11016 11045 11047 11053 11084 11104 11113 11114 11116 11118 
##     1     1     0     0     1     0     0     1     0     1     1     0     0 
## 11121 11122 11149 11162 11170 11173 11196 11200 11206 11222 11225 11227 11230 
##     0     1     1     1     0     0     0     1     1     1     1     0     1 
## 11233 11248 11256 11272 11275 11281 11285 11297 11311 11314 11317 11323 11329 
##     0     1     0     0     1     0     1     1     0     1     0     1     0 
## 11331 11351 11363 11370 11394 11397 16151 16190 20004 20005 20014 20018 20023 
##     1     0     1     0     0     1     1     0     1     0     1     0     0 
## 20036 20042 20048 20054 20062 20068 20070 20077 20079 20082 20090 20097 20103 
##     0     0     1     0     0     0     0     1     0     0     1     0     1 
## 20115 20122 20123 20134 20140 20142 20147 20149 20150 20159 20161 20163 32194 
##     1     1     0     0     1     1     0     1     0     0     0     0     1 
## 32270 32271 32279 32388 32471 32521 32536 32546 32566 32579 32607 32644 
##     1     1     1     1     1     1     1     1     1     1     1     1
conf_matrix <- table(Predicted = predicted_classes, Actual = log.reg.df$ecoli)
print(conf_matrix)
##          Actual
## Predicted   0   1
##         0 103   1
##         1   1 102
# Extract the optimal threshold from the ROC curve
optimal_threshold_info <- coords(auc_ecoli, "best", ret = c("threshold", "sensitivity", "specificity"))
print(optimal_threshold_info)
##   threshold sensitivity specificity
## 1 0.1021965   0.9902913   0.9903846
# Kleb

log.reg.df$kleb <- ifelse(reads.meta.counts$klebsiella == 1, 1, 0)  
log.reg.df$kleb[is.na(log.reg.df$kleb)] <- 0  

kleb <- glm(kleb ~ Klebsiella.pneumoniae, data = log.reg.df, family = binomial)  
kleb_probs <- predict(kleb, type = "response")
auc_kleb <- roc(log.reg.df$kleb, kleb_probs)
print(auc_kleb)
## 
## Call:
## roc.default(response = log.reg.df$kleb, predictor = kleb_probs)
## 
## Data: kleb_probs in 204 controls (log.reg.df$kleb 0) < 3 cases (log.reg.df$kleb 1).
## Area under the curve: 0.7198
plot(auc_kleb, col = "blue", main = "ROC Curve for K. pneumoniae")

# Extract optimal information
optimal_threshold <- coords(auc_kleb, "best", ret = "threshold")[1]
predicted_classes <- ifelse(kleb_probs >= optimal_threshold, 1, 0)
predicted_classes <- sapply(X = kleb_probs, function(x) ifelse(x >= optimal_threshold, 1, 0))
print(predicted_classes)
## 10005 10010 10016 10021 10022 10043 10048 10050 10065 10108 10117 10136 10145 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10160 10166 10168 10199 10201 10206 10220 10254 10257 10263 10286 10292 10308 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10315 10318 10346 10354 10365 10369 10377 10380 10383 10385 10389 10393 10395 
##     0     0     0     1     0     0     0     0     0     0     0     0     0 
## 10397 10422 10423 10428 10430 10443 10448 10454 10458 10459 10466 10468 10470 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10487 10490 10494 10506 10511 10514 10519 10530 10532 10533 10539 10542 10550 
##     0     0     0     0     0     0     0     0     0     1     0     0     0 
## 10554 10555 10581 10584 10596 10608 10622 10625 10644 10650 10656 10659 10660 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10667 10674 10675 10678 10685 10687 10703 10717 10720 10759 10763 10803 10811 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10815 10817 10821 10831 10833 10836 10852 10867 10869 10871 10877 10883 10891 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10892 10896 10897 10900 10901 10913 10927 10940 10967 10969 10973 10992 11003 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11007 11009 11010 11016 11045 11047 11053 11084 11104 11113 11114 11116 11118 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11121 11122 11149 11162 11170 11173 11196 11200 11206 11222 11225 11227 11230 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11233 11248 11256 11272 11275 11281 11285 11297 11311 11314 11317 11323 11329 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11331 11351 11363 11370 11394 11397 16151 16190 20004 20005 20014 20018 20023 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 20036 20042 20048 20054 20062 20068 20070 20077 20079 20082 20090 20097 20103 
##     0     0     0     1     0     0     0     0     0     0     0     0     0 
## 20115 20122 20123 20134 20140 20142 20147 20149 20150 20159 20161 20163 32194 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 32270 32271 32279 32388 32471 32521 32536 32546 32566 32579 32607 32644 
##     0     0     0     0     0     0     0     0     0     0     0     0
conf_matrix <- table(Predicted = predicted_classes, Actual = log.reg.df$kleb)
print(conf_matrix)
##          Actual
## Predicted   0   1
##         0 203   1
##         1   1   2
# Extract the optimal threshold from the ROC curve
optimal_threshold_info <- coords(auc_kleb, "best", ret = c("threshold", "sensitivity", "specificity"))
print(optimal_threshold_info)
##   threshold sensitivity specificity
## 1 0.2077525   0.6666667    0.995098
# EC

log.reg.df$ec <- ifelse(reads.meta.counts$enterococcus == 1, 1, 0)  
log.reg.df$ec[is.na(log.reg.df$ec)] <- 0  

ec <- glm(ec ~ Enterococcus.faecalis, data = log.reg.df, family = binomial)  
ec_probs <- predict(ec, type = "response")
auc_ec <- roc(log.reg.df$ec, ec_probs)
print(auc_ec)
## 
## Call:
## roc.default(response = log.reg.df$ec, predictor = ec_probs)
## 
## Data: ec_probs in 203 controls (log.reg.df$ec 0) < 4 cases (log.reg.df$ec 1).
## Area under the curve: 0.9778
plot(auc_ec, col = "blue", main = "ROC Curve for E. faecalis")

# Extract optimal information
optimal_threshold <- coords(auc_ec, "best", ret = "threshold")[1]
predicted_classes <- ifelse(ec_probs >= optimal_threshold, 1, 0)
predicted_classes <- sapply(X = ec_probs, function(x) ifelse(x >= optimal_threshold, 1, 0))
print(predicted_classes)
## 10005 10010 10016 10021 10022 10043 10048 10050 10065 10108 10117 10136 10145 
##     0     0     0     0     0     1     0     0     0     0     0     0     0 
## 10160 10166 10168 10199 10201 10206 10220 10254 10257 10263 10286 10292 10308 
##     0     0     0     0     0     0     0     0     0     1     0     0     0 
## 10315 10318 10346 10354 10365 10369 10377 10380 10383 10385 10389 10393 10395 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10397 10422 10423 10428 10430 10443 10448 10454 10458 10459 10466 10468 10470 
##     0     0     0     0     0     0     0     0     0     0     0     0     1 
## 10487 10490 10494 10506 10511 10514 10519 10530 10532 10533 10539 10542 10550 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10554 10555 10581 10584 10596 10608 10622 10625 10644 10650 10656 10659 10660 
##     1     0     0     1     0     0     0     0     0     0     0     0     0 
## 10667 10674 10675 10678 10685 10687 10703 10717 10720 10759 10763 10803 10811 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10815 10817 10821 10831 10833 10836 10852 10867 10869 10871 10877 10883 10891 
##     0     0     0     0     0     0     0     0     0     0     0     1     0 
## 10892 10896 10897 10900 10901 10913 10927 10940 10967 10969 10973 10992 11003 
##     0     0     0     0     0     1     0     0     0     0     0     0     1 
## 11007 11009 11010 11016 11045 11047 11053 11084 11104 11113 11114 11116 11118 
##     1     0     0     0     0     0     0     0     0     0     0     0     0 
## 11121 11122 11149 11162 11170 11173 11196 11200 11206 11222 11225 11227 11230 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11233 11248 11256 11272 11275 11281 11285 11297 11311 11314 11317 11323 11329 
##     0     0     0     0     0     0     0     0     0     1     0     0     0 
## 11331 11351 11363 11370 11394 11397 16151 16190 20004 20005 20014 20018 20023 
##     0     0     0     0     0     0     0     1     0     1     0     0     0 
## 20036 20042 20048 20054 20062 20068 20070 20077 20079 20082 20090 20097 20103 
##     1     0     1     0     1     0     0     0     0     0     0     0     0 
## 20115 20122 20123 20134 20140 20142 20147 20149 20150 20159 20161 20163 32194 
##     0     0     0     1     0     0     0     0     0     0     0     0     0 
## 32270 32271 32279 32388 32471 32521 32536 32546 32566 32579 32607 32644 
##     0     0     0     0     0     0     0     0     0     0     0     0
conf_matrix <- table(Predicted = predicted_classes, Actual = log.reg.df$ec)
print(conf_matrix)
##          Actual
## Predicted   0   1
##         0 191   0
##         1  12   4
# Extract the optimal threshold from the ROC curve
optimal_threshold_info <- coords(auc_ec, "best", ret = c("threshold", "sensitivity", "specificity"))
print(optimal_threshold_info)
##     threshold sensitivity specificity
## 1 0.009641515           1   0.9408867
# Proteus

log.reg.df$prot <- ifelse(reads.meta.counts$proteus == 1, 1, 0)  
log.reg.df$prot[is.na(log.reg.df$prot)] <- 0  

prot <- glm(prot ~ Proteus.mirabilis, data = log.reg.df, family = binomial)  
prot_probs <- predict(prot, type = "response")
auc_prot <- roc(log.reg.df$prot, prot_probs)
print(auc_prot)
## 
## Call:
## roc.default(response = log.reg.df$prot, predictor = prot_probs)
## 
## Data: prot_probs in 204 controls (log.reg.df$prot 0) < 3 cases (log.reg.df$prot 1).
## Area under the curve: 0.9984
plot(auc_prot, col = "blue", main = "ROC Curve for P. mirabilis")

# Extract optimal information
optimal_threshold <- coords(auc_prot, "best", ret = "threshold")[1]
predicted_classes <- ifelse(prot_probs >= optimal_threshold, 1, 0)
predicted_classes <- sapply(X = prot_probs, function(x) ifelse(x >= optimal_threshold, 1, 0))
print(predicted_classes)
## 10005 10010 10016 10021 10022 10043 10048 10050 10065 10108 10117 10136 10145 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10160 10166 10168 10199 10201 10206 10220 10254 10257 10263 10286 10292 10308 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10315 10318 10346 10354 10365 10369 10377 10380 10383 10385 10389 10393 10395 
##     0     0     0     0     0     0     0     1     0     0     0     0     0 
## 10397 10422 10423 10428 10430 10443 10448 10454 10458 10459 10466 10468 10470 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10487 10490 10494 10506 10511 10514 10519 10530 10532 10533 10539 10542 10550 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10554 10555 10581 10584 10596 10608 10622 10625 10644 10650 10656 10659 10660 
##     0     0     0     0     1     0     0     0     0     0     0     0     0 
## 10667 10674 10675 10678 10685 10687 10703 10717 10720 10759 10763 10803 10811 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10815 10817 10821 10831 10833 10836 10852 10867 10869 10871 10877 10883 10891 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10892 10896 10897 10900 10901 10913 10927 10940 10967 10969 10973 10992 11003 
##     0     0     0     0     0     0     0     0     1     0     0     0     0 
## 11007 11009 11010 11016 11045 11047 11053 11084 11104 11113 11114 11116 11118 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11121 11122 11149 11162 11170 11173 11196 11200 11206 11222 11225 11227 11230 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11233 11248 11256 11272 11275 11281 11285 11297 11311 11314 11317 11323 11329 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11331 11351 11363 11370 11394 11397 16151 16190 20004 20005 20014 20018 20023 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 20036 20042 20048 20054 20062 20068 20070 20077 20079 20082 20090 20097 20103 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 20115 20122 20123 20134 20140 20142 20147 20149 20150 20159 20161 20163 32194 
##     0     0     0     1     0     0     0     0     0     0     0     0     0 
## 32270 32271 32279 32388 32471 32521 32536 32546 32566 32579 32607 32644 
##     0     0     0     0     0     0     0     0     0     0     0     0
conf_matrix <- table(Predicted = predicted_classes, Actual = log.reg.df$prot)
print(conf_matrix)
##          Actual
## Predicted   0   1
##         0 203   0
##         1   1   3
# Extract the optimal threshold from the ROC curve
optimal_threshold_info <- coords(auc_prot, "best", ret = c("threshold", "sensitivity", "specificity"))
print(optimal_threshold_info)
##   threshold sensitivity specificity
## 1 0.1496063           1    0.995098
# P. aeruginosa

log.reg.df$pa <- ifelse(reads.meta.counts$mainpathogen == "Pseudomonas Aeruginosa", 1, 0)  
log.reg.df$pa[is.na(log.reg.df$pa)] <- 0  

pa <- glm(pa ~ Pseudomonas.aeruginosa, data = log.reg.df, family = binomial)  
pa_probs <- predict(pa, type = "response")
auc_pa <- roc(log.reg.df$pa, pa_probs)
print(auc_pa)
## 
## Call:
## roc.default(response = log.reg.df$pa, predictor = pa_probs)
## 
## Data: pa_probs in 206 controls (log.reg.df$pa 0) < 1 cases (log.reg.df$pa 1).
## Area under the curve: 0.9951
plot(auc_pa, col = "blue", main = "ROC Curve for P. aeruginosa")

# Extract optimal information
optimal_threshold <- coords(auc_pa, "best", ret = "threshold")[1]
predicted_classes <- ifelse(pa_probs >= optimal_threshold, 1, 0)
predicted_classes <- sapply(X = pa_probs, function(x) ifelse(x >= optimal_threshold, 1, 0))
print(predicted_classes)
## 10005 10010 10016 10021 10022 10043 10048 10050 10065 10108 10117 10136 10145 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10160 10166 10168 10199 10201 10206 10220 10254 10257 10263 10286 10292 10308 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10315 10318 10346 10354 10365 10369 10377 10380 10383 10385 10389 10393 10395 
##     0     0     0     0     0     0     0     1     0     0     0     0     0 
## 10397 10422 10423 10428 10430 10443 10448 10454 10458 10459 10466 10468 10470 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10487 10490 10494 10506 10511 10514 10519 10530 10532 10533 10539 10542 10550 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10554 10555 10581 10584 10596 10608 10622 10625 10644 10650 10656 10659 10660 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10667 10674 10675 10678 10685 10687 10703 10717 10720 10759 10763 10803 10811 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10815 10817 10821 10831 10833 10836 10852 10867 10869 10871 10877 10883 10891 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 10892 10896 10897 10900 10901 10913 10927 10940 10967 10969 10973 10992 11003 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11007 11009 11010 11016 11045 11047 11053 11084 11104 11113 11114 11116 11118 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11121 11122 11149 11162 11170 11173 11196 11200 11206 11222 11225 11227 11230 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 11233 11248 11256 11272 11275 11281 11285 11297 11311 11314 11317 11323 11329 
##     0     0     0     0     0     0     0     0     0     0     0     0     1 
## 11331 11351 11363 11370 11394 11397 16151 16190 20004 20005 20014 20018 20023 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 20036 20042 20048 20054 20062 20068 20070 20077 20079 20082 20090 20097 20103 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 20115 20122 20123 20134 20140 20142 20147 20149 20150 20159 20161 20163 32194 
##     0     0     0     0     0     0     0     0     0     0     0     0     0 
## 32270 32271 32279 32388 32471 32521 32536 32546 32566 32579 32607 32644 
##     0     0     0     0     0     0     0     0     0     0     0     0
conf_matrix <- table(Predicted = predicted_classes, Actual = log.reg.df$pa)
print(conf_matrix)
##          Actual
## Predicted   0   1
##         0 205   0
##         1   1   1
# Extract the optimal threshold from the ROC curve
optimal_threshold_info <- coords(auc_pa, "best", ret = c("threshold", "sensitivity", "specificity"))
print(optimal_threshold_info)
##    threshold sensitivity specificity
## 1 0.08840385           1   0.9951456
  1. Create Bracken-based heatmaps showing species abundance across samples
plot_filtered_rpm_heatmap <- function(rpm_df, abundance_threshold = 1, log_transform = TRUE, cluster_rows = TRUE, cluster_columns = TRUE, top_n_species = NULL) {
  # Check input
  if (!is.data.frame(rpm_df)) stop("Input must be a data frame.")
  
  # Remove any non-numeric columns (e.g., metadata)
  rpm_mat <- rpm_df %>% dplyr::select(where(is.numeric)) %>% as.matrix()
  rownames(rpm_mat) <- rownames(rpm_df)

  # Filter species (columns) based on abundance threshold in any sample
  # Threshold is in percent — so convert to raw RPM
  filtered_mat <- rpm_mat[ , apply(rpm_mat, 2, function(col) any(col > abundance_threshold)), drop = FALSE]

  # Optional: keep only top N most variable species
  if (!is.null(top_n_species)) {
    species_var <- apply(filtered_mat, 2, var)
    top_species <- names(sort(species_var, decreasing = TRUE))[1:min(top_n_species, length(species_var))]
    filtered_mat <- filtered_mat[, top_species, drop = FALSE]
  }

  # Log-transform to reduce skew (optional)
  if (log_transform) {
    filtered_mat <- log10(filtered_mat + 1)  # add 1 to avoid log(0)
  }

  # Heatmap color scale
  col_fun <- colorRamp2(c(min(filtered_mat), max(filtered_mat)), c("white", "red"))

  # Plot heatmap
  Heatmap(filtered_mat,
          name = "RPM",
          col = col_fun,
          show_row_names = FALSE,
          show_column_names = TRUE,
          cluster_rows = cluster_rows,
          cluster_columns = cluster_columns,
          row_title = "214 Patients",
          column_title = "Bracken Output Excluding Human",
          column_names_gp = grid::gpar(fontsize = 4),
          heatmap_legend_param = list(title = ifelse(log_transform, "log10(RPM+1)", "RPM")))
}

without_human <- plot_filtered_rpm_heatmap(data.frame(df_renormalized), abundance_threshold = 10000)
with_human <- plot_filtered_rpm_heatmap(data.frame(rpmdata), abundance_threshold = 500)

print(with_human)

print(without_human)

3. How well does host-response and pathogen abundance overlap?

  1. Gene expression heatmap overlayed with pathogen abundance score
# Create a patient order based on k-means group: positive (1) then negative (0)
ordered_k_groups <- k_groups[order(k_groups, decreasing = TRUE)]
ordered_k_groups_name <- names(ordered_k_groups)
ordered_k_groups <- sapply(ordered_k_groups, function(x) ifelse(x == 0, 2, 1))

# Find up and down genes
sig.genes.up <- rownames(res.df.human[res.df.human$log2FoldChange > 2 & res.df.human$padj < 1e-25 , ])
sig.genes.up <- sig.genes.up[!is.na(sig.genes.up) & !grepl("^NA", sig.genes.up)]

sig.genes.down <- rownames(res.df.human[res.df.human$log2FoldChange < -2 & res.df.human$padj < 1e-25 , ])
sig.genes.down <- sig.genes.down[!is.na(sig.genes.down) & !grepl("^NA", sig.genes.down)]

# Bind these genes together
sig.genes.heatmap <- c(sig.genes.up, sig.genes.down)
sig.genes.heatmap <- sig.genes.heatmap[!is.na(sig.genes.heatmap) & !grepl("^NA", sig.genes.heatmap)]

# Format the z-score-based heatmap fill
temp = t(scale(t(assay(vst.for.heatmap[,])))) 
temp[temp < -4] = -4
temp[temp > +4] = +4
temp[is.na(temp)] = 0
temp <- temp[sig.genes.heatmap , ]

# Add first col annotation- the k-means cluster

col_anno <- HeatmapAnnotation(k_means_group = ordered_k_groups,
                              col = list(ordered_k_groups = c("2" = "blue", "1" = "red")))

# Add second col annotation, unlogged pathogen abundance score bar plot
rpm_sums_to_use <- rpm_sums[ordered_k_groups_name]
col_anno <- HeatmapAnnotation(
  `Unlogged Pathogen\nAbundance Score` = anno_barplot(10^rpm_sums_to_use, gp = gpar(fill = "gray")),
  `k-means cluster` = ordered_k_groups, 
  col = list(`k-means cluster` = c("2" = "blue", "1" = "red")),
  annotation_name_gp = gpar(fontsize = 10, fontface = "bold"),
  annotation_name_rot = 0
)

# Same thing, but remove side and top annotations for the figure
col_anno <- HeatmapAnnotation(
  `Unlogged Pathogen\nAbundance Score` = anno_barplot(10^rpm_sums_to_use, gp = gpar(fill = "gray")),
  annotation_name_gp = gpar(fontsize = 8, fontface = "bold"),
  annotation_name_rot = 0
)

# Side annotation

# Do this for binary now to help for the row annotation
binary.up <- rep("1", length(sig.genes.up))
binary.down <- rep("2", length(sig.genes.down))
binary.combined <- c(binary.up, binary.down)
names(binary.combined) <- sig.genes.heatmap

row_anno <- rowAnnotation(
  `Gene Expression` = binary.combined, 
  col = list(`Gene Expression` = c("2" = "blue", "1" = "red")),
  annotation_name_gp = gpar(fontsize = 8, fontface = "bold"),
  annotation_name_rot = 0, 
  annotation_name_offset = unit(2, "mm")
)

# Generate heatmap without x-axis, y-axis labels, and annotation labels
gene.heatmap <- Heatmap(
  temp[ , ordered_k_groups_name],
  show_column_names = FALSE,  
  show_row_names = FALSE, 
  column_order = ordered_k_groups_name,
  top_annotation = col_anno,
  # right_annotation = row_anno, 
  heatmap_legend_param = list(
    title = "Z-score\ngene expression",
    labels_gp = gpar(fontsize = 10), 
    title_gp = gpar(fontsize = 10, fontface = "bold"),  
    position = "right", 
    legend_gp = gpar(fontsize = 28)  
))

gene.heatmap

  1. Violin plot comparing pathogen abundance scores across both k-means clusters
# Create a data frame for plotting
plot_data <- data.frame(
  k_means_group = factor(ordered_k_groups, levels = unique(ordered_k_groups)), 
  rpm_sums = rpm_sums_to_use
)

# Assign colors based on k-means group position (left = red, right = blue)
plot_data$point_color <- ifelse(as.numeric(plot_data$k_means_group) <= length(unique(ordered_k_groups)) / 2, "red", "blue")
plot_data$k_means_group <- factor(plot_data$k_means_group, levels = c("1", "2"), labels = c("Group 1", "Group 2"))

# Perform a stat test to confirm what p val is for both samples: 
shapiro_results <- by(plot_data$rpm_sums, plot_data$k_means_group, shapiro.test)
print(shapiro_results)
## plot_data$k_means_group: Group 1
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.35907, p-value < 2.2e-16
## 
## ------------------------------------------------------------ 
## plot_data$k_means_group: Group 2
## 
##  Shapiro-Wilk normality test
## 
## data:  dd[x, ]
## W = 0.91013, p-value = 2.03e-06
# Since not normal, follow wilcox test:
wilcox_result <- wilcox.test(rpm_sums ~ k_means_group, data = plot_data)

print(wilcox_result$p.value)
## [1] 4.056181e-29
# Effect size calculation using Cliff's Delta
cliffs_delta <- cliff.delta(plot_data$rpm_sums[plot_data$k_means_group == "Group 1"], 
                            plot_data$rpm_sums[plot_data$k_means_group == "Group 2"])

print(cliffs_delta)
## 
## Cliff's Delta
## 
## delta estimate: 0.9019828 (large)
## 95 percent confidence interval:
##     lower     upper 
## 0.8152732 0.9491320
violin <- ggplot(plot_data, aes(x = k_means_group, y = 10^rpm_sums, fill = k_means_group)) +
  geom_violin(alpha = 0.5, scale = "width") +  # Violin plot
  geom_boxplot(width = 0.2, outlier.shape = NA, color = "black", alpha = 0.7) + 
  geom_jitter(aes(color = point_color), width = 0.2, size = 2, alpha = 0.7) + 
  scale_fill_manual(values = c("red", "blue")) +  
  scale_color_identity() + 
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1)
  ) +
  labs(x = "K-means Group", y = "Pathogen Abundance (RPM)", title = "Violin Plot of RPM Sums by K-means Group")
violin

  1. ROC and AUC: using pathogen abundance scores to predict k-means clusters
par(pty = "s")
cluster <- ifelse(kmeans_result_pca$cluster == 2, 0, 1)
regression_1 <- glm(cluster ~ rpm_sums, data = data.frame(cluster = cluster, rpm_sums), family = binomial)  
regression_1_probs <- predict(regression_1, type = "response")
auc_regression_1 <- roc(cluster, regression_1_probs)
print(auc_regression_1)
## 
## Call:
## roc.default(response = cluster, predictor = regression_1_probs)
## 
## Data: regression_1_probs in 108 controls (cluster 0) < 99 cases (cluster 1).
## Area under the curve: 0.951
plot(auc_regression_1, col = "blue", main = "Pathogen Abundance Score Predicts RNA-Seq Host Response", asp = 1, lwd = 3)

  1. Immune gene expression levels over pathogen abundance classes
generate.scatter <- function(gene, ensemblID, tpm, rpm_sums, threshold, save = FALSE) {
  
  # Get TPM values for selected gene
  score <- tpm[ensemblID, ]
  
  # Create data frame
  scatter.df <- data.frame(rpm_sums, score)
  cor_test <- cor.test(scatter.df$rpm_sums, scatter.df$score)
  
  # Create scatter plot
  scatter <- ggplot(scatter.df, aes(x = rpm_sums, y = score)) + 
    geom_smooth(method = "lm", se = FALSE, color = "red") + 
    geom_point(size = 3) +
    labs(
      title = paste(gene, "Expression vs. UTI Pathogen Abundance"),
      x = "log10(Pathogen RPM Score)",
      y = paste(gene, "TPM")
    ) +
    theme_minimal() + 
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold"), # Center and bold title
      text = element_text(size = 14), # Increase overall text size
      panel.border = element_rect(color = "black", fill = NA, size = 1.5) # Add border
    )
  
  # Create a grouping variable for threshold
  scatter.df$group <- ifelse(scatter.df$rpm_sums >= threshold, "Positive", "Negative")
  
  # Create the jitter + boxplot with t-test annotation
  jitter_boxplot <- ggplot(scatter.df, aes(x = group, y = score, color = group, fill = group)) +
    geom_boxplot(outlier.shape = NA, alpha = 0.4, color = "black") +  # Boxplot without outliers
    geom_jitter(width = 0.2, size = 3, alpha = 0.7) +
    scale_color_manual(values = c("Negative" = "blue", "Positive" = "red")) +
    scale_fill_manual(values = c("Negative" = "blue", "Positive" = "red")) +
    labs(
      title = paste(gene, "Expression by Pathogen Abundance Group"),
      x = "Pathogen Group",
      y = paste(gene, "TPM")
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold"),
      text = element_text(size = 14), 
      panel.border = element_rect(color = "black", fill = NA, size = 1.5)
    ) +
    stat_compare_means(method = "t.test", label = "p.format", 
                       label.y = max(scatter.df$score) * 1.05)
  
  # Save plots if needed
  if (save) {
    output_file = paste0(gene, ".pdf")
    pdf(output_file, width = 8, height = 6)
    print(scatter)
    print(jitter_boxplot)
    dev.off()
  }
  
  # Return plots as a list
  return(list(scatter = scatter, jitter = jitter_boxplot))
}

# Use sample genes
NFKB2 <- generate.scatter("NFKB2", "ENSG00000077150", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
JAK3 <- generate.scatter("JAK3", "ENSG00000105639", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
LIMK2 <- generate.scatter("LIMK2", "ENSG00000182541", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
CSF3R <- generate.scatter("CSF3R", "ENSG00000119535", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
IL8 <- generate.scatter("IL8", "ENSG00000169429", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
TNF <- generate.scatter("TNF", "ENSG00000232810", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)
HLAA <- generate.scatter("HLAA", "ENSG00000206503", txi.data$abundance, rpm_sums, threshold = threshold, save = FALSE)

# Print them
print(NFKB2$jitter)

print(JAK3$jitter)

print(LIMK2$jitter)

print(CSF3R$jitter)

print(IL8$jitter)

print(TNF$jitter)

print(HLAA$jitter)

4. How well does RNA-seq agree with clinical UTI diagnostics?

  1. Comparing with clinical: venn diagrams proportional to sizes of groups
# First, create the df containing all four binary classifications (venn.df)

# 1 -> RNA-seq pathogen abundance
rpm_sums_binary <- ifelse(rpm_sums > threshold, 1, 0)

# 2 -> Clinical host response (leukocyte esterase)
enrleuks <- reads.meta.counts$enrleuks
binary.enrleuks <- ifelse(enrleuks == 1 | is.na(enrleuks), 0, 1)

# 3 -> RNA-seq host response (k-means clusters, already computed above in 'cluster' object)

# 4 -> Clinical pathogen abundance
clin <- reads.meta.counts$uti

venn.df <- data.frame(patients.over.thres, 
                      cluster, 
                      rpm_sums_binary, 
                      binary.enrleuks, 
                      clin)

# remove patient 20070, because they do not have leukocyte esterase activity data (n = 213 now)
venn.df <- venn.df %>% filter(patients.over.thres != 20070)

# This will only be used for the ROC analysis predicting leukocyte esterase
rpm_sums_temp <- rpm_sums[names(rpm_sums) != 20070]

# Add an all-zero row to count patients with no group memberships
venn.df <- venn.df[ , c(2, 3, 4, 5)]

colnames(venn.df) <- c("RNA-Seq Host Response Cluster", "RNA-Seq Pathogen Abundance", 
                       "Clinical Host Response Score", "Clinical Pathogen Abundance")

# Generate Euler diagram
new.venn <- euler(venn.df)

# Plot four overlapping categories
plot(new.venn, 
     quantities = TRUE, 
     fills = c("cornflowerblue", "lightcoral", "goldenrod1", "mediumseagreen"),
     alpha = 0.6, 
     labels = list(font = 2, cex = 1.2), 
     edges = list(col = "black", lwd = 2),
     legend = TRUE)

rnaseq_venn <- euler(venn.df[ , c(1,2)])


par(mar = c(5, 5, 5, 5)) 

# Plot RNA-seq host response versus pathogen abundance
plot(rnaseq_venn, 
     quantities = list(cex = 1.5),
     fills = c("red", "blue"), 
     alpha = 0.6,    
     labels = FALSE,
     edges = list(col = "black", lwd = 2),  
     legend = TRUE, 
     quantities.cex = 10) 

# Plot clinical host response versus pathogen abundance
clinical_venn <- euler(venn.df[ , c(3,4)])
plot(clinical_venn, 
     quantities = list(cex = 1.5),
     fills = c("green", "purple"), 
     alpha = 0.6,  
     edges = list(col = "black", lwd = 2), 
     legend = TRUE,
     quantities.cex = 10) 

  1. Comparing with clinical: same venn diagrams (not proportional)
# Create a list of sets for all positive diagnoses across groups
sets <- list(
  "RNA-Seq Host\nResponse Score" = which(venn.df[ , 1] == 1),
  "RNA-Seq\nTaxonomic Score" = which(venn.df[ , 2] == 1),
  "Clinical Host Response\n(Leukocyte Esterase)" = which(venn.df[ , 3] == 1),
  "Clinical Taxonomic\nScore (Culture)" = which(venn.df[ , 4] == 1)
)

# Plot the Venn diagram
venn_plot <- ggvenn(sets, 
       fill_color = c("red", "blue", "green", "purple"), 
       text_size = 5, 
       set_name_size = 5)

# Should add a count for all negative patients
all_zero_count <- sum(rowSums(venn.df) == 0)

# Add annotation to the plot
venn_plot +
  annotate(
    "text", 
    x = 0.5, y = -2,
    label = paste("All zero count:", all_zero_count),
    size = 5,
    hjust = 0.5
  )

# 1) -> Just RNA-Seq

# Create a list of sets
sets1 <- list(
  "RNA-Seq HR" = which(venn.df[ , 1] == 1),
  "RNA-Seq Tax" = which(venn.df[ , 2] == 1))

# Plot the Venn diagram
venn_plot1 <- ggvenn(sets1, 
                    fill_color = c("red", "blue"), 
                    text_size = 4, 
                    set_name_size = 4)

venn_plot1

# 2) -> Just Clinical

# Create a list of sets
sets2 <- list(
  "Clinical HR" = which(venn.df[ , 3] == 1),
  "Clinical Tax" = which(venn.df[ , 4] == 1))

# Plot the Venn diagram
venn_plot2 <- ggvenn(sets2, 
                     fill_color = c("green", "purple"), 
                     text_size = 4, 
                     set_name_size = 4)

venn_plot2

  1. Comparing with clinical: comparing absolute positives and absolute negatives of RNA-seq versus clinical
rnaseq_positives <- rownames(venn.df[venn.df$`RNA-Seq Host Response Cluster` == venn.df$`RNA-Seq Pathogen Abundance` &  venn.df$`RNA-Seq Host Response Cluster` == 1, ])
rnaseq_negatives <- setdiff(rownames(venn.df), rnaseq_positives)

clinical_positives <- rownames(venn.df[venn.df$`Clinical Host Response Score` == venn.df$`Clinical Pathogen Abundance` &  venn.df$`Clinical Host Response Score` == 1, ])
clinical_negatives <- setdiff(rownames(venn.df), clinical_positives)

all_patients <- unique(c(clinical_negatives, clinical_positives, rnaseq_positives, rnaseq_negatives))

patient_df <- data.frame(
  patient_id = all_patients,
  clinical = ifelse(all_patients %in% clinical_positives, "clinical positive", "clinical negative"),
  rnaseq = ifelse(all_patients %in% rnaseq_positives, "RNA-seq positive", "RNA-seq negative")
)
table(patient_df$clinical, patient_df$rnaseq)
##                    
##                     RNA-seq negative RNA-seq positive
##   clinical negative              102                2
##   clinical positive                7               95
venn_sets <- list(
  Clinical_Pos = clinical_positives,
  Clinical_Neg = clinical_negatives,
  RNAseq_Pos = rnaseq_positives,
  RNAseq_Neg = rnaseq_negatives
)

fit <- euler(venn_sets)
# plot(fit, fills = c("#1b9e77", "#d95f02", "#7570b3", "#e7298a"), labels = TRUE, quantities = TRUE)
  1. ROC and AUC: using pathogen abundance scores to predict clinical labels
# Predicting clinical host response
par(pty = "s")
regression_2 <- glm(venn.df$`Clinical Host Response Score` ~ rpm_sums_temp, data = data.frame(`Clinical Host Response Score` = venn.df$`Clinical Host Response Score`, rpm_sums_temp), family = binomial)
regression_2_probs <- predict(regression_2, type = "response")
auc_regression_2 <- roc(venn.df$`Clinical Host Response Score`, regression_2_probs)
print(auc_regression_2)
## 
## Call:
## roc.default(response = venn.df$`Clinical Host Response Score`,     predictor = regression_2_probs)
## 
## Data: regression_2_probs in 99 controls (venn.df$`Clinical Host Response Score` 0) < 107 cases (venn.df$`Clinical Host Response Score` 1).
## Area under the curve: 0.9514
plot(auc_regression_2, col = "blue", main = "Pathogen Abundance Score Predicts Clinical Host Response", lwd = 3)

# Clinical taxonomy
regression_3 <- glm(venn.df$`Clinical Pathogen Abundance` ~ rpm_sums_temp, data = data.frame(`Clinical Pathogen Abundance` = venn.df$`Clinical Pathogen Abundance`, rpm_sums_temp), family = binomial)
regression_3_probs <- predict(regression_3, type = "response")
auc_regression_3 <- roc(venn.df$`Clinical Pathogen Abundance`, regression_3_probs)
print(auc_regression_3)
## 
## Call:
## roc.default(response = venn.df$`Clinical Pathogen Abundance`,     predictor = regression_3_probs)
## 
## Data: regression_3_probs in 96 controls (venn.df$`Clinical Pathogen Abundance` 0) < 110 cases (venn.df$`Clinical Pathogen Abundance` 1).
## Area under the curve: 0.9848
plot(auc_regression_3, col = "blue", main = "Pathogen Abundance Score Predicts Clinical Pathogen", lwd = 3)

# Extract optimal information
optimal_threshold <- coords(auc_regression_3, "best", ret = "threshold")[1]
predicted_classes <- ifelse(regression_3_probs >= optimal_threshold, 1, 0)
predicted_classes <- sapply(X = regression_3_probs, function(x) ifelse(x >= optimal_threshold, 1, 0))
print(predicted_classes)
## 10005 10010 10016 10021 10022 10043 10048 10050 10065 10108 10117 10136 10145 
##     0     0     1     1     1     1     1     0     1     0     0     1     0 
## 10160 10166 10168 10199 10201 10206 10220 10254 10257 10263 10286 10292 10308 
##     1     1     1     1     0     1     0     1     1     0     1     0     1 
## 10315 10318 10346 10354 10365 10369 10377 10380 10383 10385 10389 10393 10395 
##     1     1     1     1     0     1     0     1     1     1     0     0     1 
## 10397 10422 10423 10428 10430 10443 10448 10454 10458 10459 10466 10468 10470 
##     0     1     1     0     0     1     0     1     0     1     1     1     1 
## 10487 10490 10494 10506 10511 10514 10519 10530 10532 10533 10539 10542 10550 
##     1     1     1     1     0     1     1     1     0     1     1     0     0 
## 10554 10555 10581 10584 10596 10608 10622 10625 10644 10650 10656 10659 10660 
##     1     0     1     0     1     0     1     0     0     0     1     1     0 
## 10667 10674 10675 10678 10685 10687 10703 10717 10720 10759 10763 10803 10811 
##     1     1     1     0     0     0     0     0     1     1     0     0     0 
## 10815 10817 10821 10831 10833 10836 10852 10867 10869 10871 10877 10883 10891 
##     0     0     1     0     0     1     0     0     0     1     0     0     1 
## 10892 10896 10897 10900 10901 10913 10927 10940 10967 10969 10973 10992 11003 
##     1     0     0     0     0     0     0     0     1     1     0     0     0 
## 11007 11009 11010 11016 11045 11047 11053 11084 11104 11113 11114 11116 11118 
##     1     1     0     0     1     0     0     1     0     1     1     0     0 
## 11121 11122 11149 11162 11170 11173 11196 11200 11206 11222 11225 11227 11230 
##     0     1     1     1     0     0     0     1     1     1     1     0     1 
## 11233 11248 11256 11272 11275 11281 11285 11297 11311 11314 11317 11323 11329 
##     0     1     0     0     1     0     1     1     0     1     0     1     0 
## 11331 11351 11363 11370 11394 11397 16151 16190 20004 20005 20014 20018 20023 
##     1     0     1     0     0     1     1     0     1     0     1     0     0 
## 20036 20042 20048 20054 20062 20068 20077 20079 20082 20090 20097 20103 20115 
##     0     0     1     1     0     0     1     0     0     1     0     1     1 
## 20122 20123 20134 20140 20142 20147 20149 20150 20159 20161 20163 32194 32270 
##     1     0     0     1     1     0     1     0     0     0     0     1     1 
## 32271 32279 32388 32471 32521 32536 32546 32566 32579 32607 32644 
##     1     1     1     1     1     1     1     1     1     1     1
conf_matrix <- table(Predicted = predicted_classes, Actual = venn.df$`Clinical Pathogen Abundance`)
print(conf_matrix)
##          Actual
## Predicted   0   1
##         0  95   2
##         1   1 108
# Extract the optimal threshold from the ROC curve
optimal_threshold_info <- coords(auc_regression_3, "best", ret = c("threshold", "sensitivity", "specificity"))
print(optimal_threshold_info)
##   threshold sensitivity specificity
## 1 0.5801459   0.9818182   0.9895833
  1. ROC and AUC: using representative host response gene and pathogen scores to predict clinical labels
#' Goal: Find centroid gene in k-means cluster 1 that is closest to the average expression
#' profile of all genes
cluster_1_patients <- rownames(venn.df[venn.df$`RNA-Seq Host Response Cluster` == 1 , ])
cluster_1_TPM <- txi.data$abundance[ , cluster_1_patients]

centroid <- colMeans(cluster_1_TPM)
distances <- apply(cluster_1_TPM, 1, function(gene_expr) {
  sqrt(sum((gene_expr - centroid)^2))
})

# 3. Find the gene with the smallest distance to the centroid
closest_gene <- names(which.min(distances))

# 4. Extract the expression profile of that gene (optional)
closest_gene_profile <- txi.data$abundance["ENSG00000119535", ]
closest_gene_profile <- closest_gene_profile[order(names(closest_gene_profile))]
closest_gene_profile <- closest_gene_profile[names(closest_gene_profile) != "20070"]
length(closest_gene_profile)
## [1] 206
# Predicting clinical host response
par(pty = "s")
regression_4 <- glm(venn.df$`Clinical Host Response Score` ~ closest_gene_profile, data = data.frame(`Clinical Host Response Score` = venn.df$`Clinical Host Response Score`, closest_gene_profile), family = binomial)
regression_4_probs <- predict(regression_4, type = "response")
auc_regression_4 <- roc(venn.df$`Clinical Host Response Score`, regression_4_probs)
print(auc_regression_4)
## 
## Call:
## roc.default(response = venn.df$`Clinical Host Response Score`,     predictor = regression_4_probs)
## 
## Data: regression_4_probs in 99 controls (venn.df$`Clinical Host Response Score` 0) < 107 cases (venn.df$`Clinical Host Response Score` 1).
## Area under the curve: 0.9929
plot(auc_regression_4, col = "blue", main = "CSF3R Expression Predicts Clinical Host Response", lwd = 3)

# Predicting clinical taxonomy
regression_5 <- glm(venn.df$`Clinical Pathogen Abundance` ~ closest_gene_profile, data = data.frame(`Clinical Pathogen Abundance` = venn.df$`Clinical Pathogen Abundance`, closest_gene_profile), family = binomial)
regression_5_probs <- predict(regression_5, type = "response")
auc_regression_5 <- roc(venn.df$`Clinical Pathogen Abundance`, regression_5_probs)
print(auc_regression_5)
## 
## Call:
## roc.default(response = venn.df$`Clinical Pathogen Abundance`,     predictor = regression_5_probs)
## 
## Data: regression_5_probs in 96 controls (venn.df$`Clinical Pathogen Abundance` 0) < 110 cases (venn.df$`Clinical Pathogen Abundance` 1).
## Area under the curve: 0.9697
plot(auc_regression_5, col = "blue", main = "CSF3R Expression Predicts Clinical Pathogen", lwd = 3)

# Predicting RNA-seq host response
par(pty = "s")
regression_6 <- glm(venn.df$`RNA-Seq Host Response Cluster` ~ closest_gene_profile, data = data.frame(`RNA-Seq Host Response Cluster` = venn.df$`RNA-Seq Host Response Cluster`, closest_gene_profile), family = binomial)
regression_6_probs <- predict(regression_6, type = "response")
auc_regression_6 <- roc(venn.df$`RNA-Seq Host Response Cluster`, regression_6_probs)
print(auc_regression_6)
## 
## Call:
## roc.default(response = venn.df$`RNA-Seq Host Response Cluster`,     predictor = regression_6_probs)
## 
## Data: regression_6_probs in 107 controls (venn.df$`RNA-Seq Host Response Cluster` 0) < 99 cases (venn.df$`RNA-Seq Host Response Cluster` 1).
## Area under the curve: 0.9888
plot(auc_regression_6, col = "blue", main = "CSF3R Expression Predicts RNA-seq Host Response", lwd = 3)

# Predicting RNA-seq taxonomy
regression_7 <- glm(venn.df$`RNA-Seq Pathogen Abundance` ~ closest_gene_profile, data = data.frame(`RNA-Seq Pathogen Abundance` = venn.df$`RNA-Seq Pathogen Abundance`, closest_gene_profile), family = binomial)
regression_7_probs <- predict(regression_5, type = "response")
auc_regression_7 <- roc(venn.df$`RNA-Seq Pathogen Abundance`, regression_7_probs)
print(auc_regression_7)
## 
## Call:
## roc.default(response = venn.df$`RNA-Seq Pathogen Abundance`,     predictor = regression_7_probs)
## 
## Data: regression_7_probs in 93 controls (venn.df$`RNA-Seq Pathogen Abundance` 0) < 113 cases (venn.df$`RNA-Seq Pathogen Abundance` 1).
## Area under the curve: 0.966
plot(auc_regression_7, col = "blue", main = "CSF3R Expression Predicts RNA-seq Pathogen", lwd = 3)

  1. Exploring the patients that do not agree
pred.df <- venn.df
 
provide.grouping <- function(MT_HR, MT_Tax, Clin_HR, Clin_Tax) {
  comb <- paste0(MT_HR, MT_Tax, Clin_HR, Clin_Tax)
  # Define meaningful labels for each combination
  group_labels <- list(
    "1111" = "All_Positive",
    "0000" = "All_Negative",
    "1110" = "Clin_Tax_Negative",
    "1101" = "Clin_HR_Negative",
    "1011" = "MT_Tax_Negative",
    "0111" = "MT_HR_Negative",
    "1100" = "Clin_Tax_HR_Negative",
    "1010" = "MT_Tax_Clin_Tax_Negative",
    "1001" = "MT_Tax_Clin_HR_Negative",
    "0110" = "MT_HR_Clin_Tax_Negative",
    "0101" = "MT_HR_Clin_HR_Negative",
    "0011" = "MT_HR_MT_Tax_Negative",
    "1000" = "Only_MT_HR_Positive",
    "0100" = "Only_MT_Tax_Positive",
    "0010" = "Only_Clin_HR_Positive",
    "0001" = "Only_Clin_Tax_Positive"
  )
  # Return the appropriate label or NA if not found
  return(group_labels[[comb]])
}
 
# Apply function to all rows, giving a string value for each
pred.df$group <- mapply(provide.grouping, 
                        pred.df[ , 1],
                        pred.df[ , 2],
                        pred.df[ , 3], 
                        pred.df[ , 4])
 
# Working with df_renormalized, FYI
incorrect_patients <- rownames(pred.df[pred.df$group != "All_Negative" &
                                       pred.df$group != "All_Positive", ])
df_renormalized_subset <- df_renormalized[incorrect_patients , ]
 
# Function that will generate a urobiome plot on this new df
generate.urobiome.plot.with.matrix <- function(rpm, patients, rpm.threshold, metadata) {
  require(ggplot2)
  require(reshape2)
  require(cowplot)
  require(dplyr)
  require(RColorBrewer)
 
  # 1. Subset RPM matrix
  rpm.subsetted <- rpm[patients, ]
  
  # 2. Re-make the function but inputting
  sum.above.thres <- function(rpm.df, rpm.threshold = 1000) {
  
    # Initialize list to store results
    result_list <- list()
  
    for (i in seq_len(nrow(rpm.df))) {
      patient_name <- rownames(rpm.df)[i]
      current.row <- rpm.df[i, ]
  
      # Keep taxa above threshold
      patients.above <- unlist(current.row)[unlist(current.row) >= rpm.threshold]
      sum.rpm.above <- sum(patients.above)
      sum.rpm.below <- 1e6 - sum.rpm.above
  
      # Add "other" category
      patients.above <- c(patients.above, "other" = sum.rpm.below)
  
      # Safety check
      if (abs(sum(patients.above) - 1e6) > 1) stop("RPM must sum to 1 million!")
  
      # Normalize to proportions
      patients.above <- patients.above / sum(patients.above)
  
      # Store as a data frame row
      result_list[[patient_name]] <- as.data.frame(t(patients.above), check.names = FALSE)
    }
  
    # Combine all rows into a single data frame
    result_df <- bind_rows(result_list, .id = "patient")
    return(result_df)
  }
  
  stacked_bar_data <- sum.above.thres(rpm, rpm.threshold = rpm.threshold)
  
  # 3. Melt into long dataframe for plotting
  # stacked_bar_data <- stacked_bar_data %>%
  #   rename_with(~ "Patient", .cols = "patient")  # rename 'patient' column to 'Patient'
  
  plot_data <- stacked_bar_data %>%
    rename_with(~ "Patient", .cols = "patient") %>%
    reshape2::melt(id.vars = "Patient", variable.name = "Taxa", value.name = "Abundance") %>%
    filter(!is.na(Abundance)) %>%
    group_by(Patient) %>%
    mutate(Abundance = Abundance / sum(Abundance)) %>%
    ungroup()
    
  temp_pred_df <- pred.df
  temp_pred_df$Patient <- rownames(temp_pred_df)
    
  plot_data <- plot_data %>%
    left_join(temp_pred_df, by = "Patient")
  
  # Reorder by pred.df$group
  patient_order <- order(plot_data$group)
  plot_data <- plot_data[patient_order , ]
  
  View(plot_data)
  
  # 4. Color setup - ensure we're using the actual taxa names
  taxa_list <- setdiff(unique(as.character(plot_data$Taxa)), "other")
  other_colors <- colorRampPalette(brewer.pal(12, "Set3"))(length(taxa_list))
  fill_colours <- c("other" = "grey", setNames(other_colors, taxa_list))
  
  # 5. Main plot with full legend
  p_main_full <- ggplot(plot_data, aes(x = Patient, y = Abundance, fill = Taxa)) +
    geom_bar(stat = "identity", width = 0.7) +
    coord_flip() +
    scale_fill_manual(values = fill_colours, name = "Taxa") +
    labs(x = "Patient", y = "Proportion of Non-Human Reads") +
    theme_minimal() +
    theme(axis.text.y = element_text(size = 8))
  
  # 6. Extract the legend before removing it
  legend <- cowplot::get_legend(p_main_full + 
                                theme(legend.position = "right",
                                      legend.text = element_text(size = 8)))
  
  # 7. Remove legend from main plot
  p_main <- p_main_full + theme(legend.position = "none")
  
  # 8. Test result matrix
  test_long_names <- c(
    "MT_HR" = "RNA-seq\nHost\nResponse",
    "MT_Tax" = "RNA-seq\nPathogen\nAbundance",
    "Clin_HR" = "Clinical\nHost\nResponse",
    "Clin_Tax" = "Clinical\nPathogen\nAbundance"
  )
  
  test_results <- data.frame(
    Patient = factor(patients, levels = patients),
    MT_HR = pred.df[patients, "RNA-Seq Host Response Cluster"],
    MT_Tax = pred.df[patients, "RNA-Seq Pathogen Abundance"],
    Clin_HR = pred.df[patients, "Clinical Host Response Score"],
    Clin_Tax = pred.df[patients, "Clinical Pathogen Abundance"],
    stringsAsFactors = FALSE
  ) %>%
    reshape2::melt(id.vars = "Patient", variable.name = "Test", value.name = "Status")
  
  p_matrix <- ggplot(test_results, aes(x = Test, y = Patient)) +
    geom_text(aes(label = ifelse(Status == 1, "✓", "x"),
                  color = factor(Status)), 
              size = 4) +
    scale_color_manual(values = c("0" = "red", "1" = "darkgreen")) +
    scale_x_discrete(position = "top", labels = test_long_names) +
    theme_minimal() +
    theme(
      axis.title = element_blank(),
      axis.text.y = element_blank(),
      panel.grid = element_blank(),
      axis.text.x = element_text(angle = 0, hjust = 0.5, size = 9),
      legend.position = "none"
    )
  
  # 9. Align and combine both plots
  aligned_plots <- align_plots(p_main, p_matrix, align = "hv", axis = "lr")
  main_matrix_plot <- plot_grid(
    aligned_plots[[1]],
    aligned_plots[[2]],
    nrow = 1,
    rel_widths = c(3, 3),
    align = "h",
    axis = "tb"
  )
  
  # 10. Return both combined plot and legend
  return(list(
    main_matrix_plot = main_matrix_plot,
    legend = legend
  ))
}
 
# So the checkmarks show up in the plot:
showtext_auto()

# Generate main plot without legend
main_plot <- generate.urobiome.plot.with.matrix(
  df_renormalized_subset, 
  incorrect_patients,
  75000,
  reads.meta.counts
)
 
# Print the main plot
print(main_plot$main_matrix_plot)

# Print the legend separately
grid::grid.newpage()
grid::grid.draw(main_plot$legend)

  1. Colour host-response PCA with clinically-determined culture-based results
# First ensure we keep all samples, including those with NA for mainpathogen
pca_df <- data.frame(
  PC1 = pca_data$PC1,
  PC2 = pca_data$PC2,
  Sample = rownames(pca_data)
  )
pca_df <- merge(pca_df, 
                reads.meta.counts[patients.over.thres, c("ptid", "mainpathogen")], 
                by.x = "Sample", 
                by.y = "ptid",
                all.x = TRUE)

# Create a new column that handles NA values
pca_df$pathogen_group <- ifelse(is.na(pca_df$mainpathogen), 
                                "NA", 
                                as.character(pca_df$mainpathogen))

# Define custom color palette for pathogens (7 colors + grey for NA)
pathogen_colors <- c(
  "E. coli" = "#1F77B4",     # Blue
  "E. coli / Proteus" = "#FF7F0E", # Orange
  "Enterococcus" = "#2CA02C", # Green
  "Klebsiella" = "#D62728", # Red
  "Proteus" = "#9467BD", # Purple
  "Pseudomonas Aeruginosa" = "#8C564B",    # Brown
  "NA" = "#a3a3a3"                     # Grey for missing
)

# Create the plot
pca_plot <- ggplot(pca_df, aes(x = PC1, y = PC2)) +
  # Add cluster ellipses first (so points appear on top)
  stat_ellipse(
    aes(group = cluster, fill = factor(cluster)),
    geom = "polygon",
    alpha = 0.1,
    color = NA,
    show.legend = TRUE, 
    level = 0.95
  ) +
  
  # Add points colored by pathogen
  geom_point(
    aes(color = pathogen_group),
    size = 3,
    alpha = 0.8
  ) +
  
  # Custom color scales
  scale_color_manual(
    values = pathogen_colors,
    na.value = "grey70",
    breaks = names(pathogen_colors)[1:7]
  ) +
  
  scale_fill_manual(
    values = c("Cluster1" = "blue", "Cluster2" = "red"),
    guide = guide_legend(override.aes = list(alpha = 0.3))
  ) +
  
  # Labels and theme
  labs(
    title = "PCA by Pathogen",
    x = "PC1",
    y = "PC2",
    color = "Main Pathogen",
    fill = "Cluster"
  ) +
  
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
    axis.title = element_text(face = "bold", size = 16),
    axis.text = element_text(size = 12),
    legend.position = "right",
    legend.title = element_text(face = "bold", size = 14),
    legend.text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "black", fill = NA, size = 1)
  )

print(pca_plot)

### Now create the same plot with 3D ###

# Perform k-means clustering using 3 PCs
kmeans_result_pca_3d <- kmeans(pca_df_3d[, c("PC1", "PC2", "PC3")], centers = 2, nstart = 25)

# Add cluster assignments to your data frame
pca_df_3d$mainpathogen <- ifelse(is.na(pca_df$mainpathogen), 
                                "NA", 
                                as.character(pca_df$mainpathogen))


# 3D PCA plot with k-means clusters
k_means_plot_3d <- plot_ly(
  data = pca_df_3d,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~mainpathogen,
  colors = pathogen_colors,
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 5, opacity = 0.8)
) %>%
  layout(
    title = list(text = "3D PCA Plot with clinical main pathogen", x = 0.5),
    scene = list(
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2"),
      zaxis = list(title = "PC3")
    )
  )

# Display the plot
k_means_plot_3d
  1. Colour host-response PCA with labels from venn diagram
set.seed(1234)

pca_df_4f <- pca_df
pca_df_4f <- pca_df_4f[pca_df_4f$Sample %in% rownames(pred.df) , ]
if (all(pca_df_4f$Sample == rownames(pred.df))) {
  pca_df_4f$group <- pred.df$group
}

new_cluster <- cluster[-(which(pca_df$Sample == "20070"))]
pca_df_4f$cluster <- kmeans_result_pca$new_cluster

# Create the plot
pca_plot_2 <- ggplot(pca_df_4f, aes(x = PC1, y = PC2)) +
  # Add cluster ellipses first (so points appear on top)
stat_ellipse(
  aes(group = new_cluster, fill = factor(new_cluster)),
  geom = "polygon",
  alpha = 0.3,
  color = "black",    # outline to make visible
  level = 0.95,
  show.legend = TRUE
) +
  
  # Add points colored by pathogen
  geom_point(
    aes(color = group),
    size = 3,
    alpha = 0.8
  ) +
  
  scale_fill_manual(
    values = c("Cluster1" = "blue", "Cluster2" = "red"),
    guide = guide_legend(override.aes = list(alpha = 0.3))
  ) +
  
  # Labels and theme
  labs(
    title = "PCA by venn group",
    x = "PC1",
    y = "PC2",
    color = "Group",
    fill = "Cluster"
  ) +
  
  theme_minimal(base_size = 14) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 18),
    axis.title = element_text(face = "bold", size = 16),
    axis.text = element_text(size = 12),
    legend.position = "right",
    legend.title = element_text(face = "bold", size = 14),
    legend.text = element_text(size = 12),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "black", fill = NA, size = 1)
  )

pca_plot_2

print(pca_plot_2)

### Now create the same plot with 3D ###

pca_df_4f_3d <- pca_df_3d
pca_df_4f_3d <- pca_df_4f_3d[-(which(rownames(pca_df_4f_3d) == "20070")) , ]
pca_df_4f_3d$group <- pca_df_4f$group

# 3D PCA plot with k-means clusters
pca_plot_4f_3d <- plot_ly(
  data = pca_df_4f_3d,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~group,
  type = "scatter3d",
  mode = "markers",
  marker = list(size = 5, opacity = 0.8)
) %>%
  layout(
    title = list(text = "3D PCA Plot with venn diagram group colours", x = 0.5),
    scene = list(
      xaxis = list(title = "PC1"),
      yaxis = list(title = "PC2"),
      zaxis = list(title = "PC3")
    )
  )

# Display the plot
pca_plot_4f_3d
  1. Take these 8 MT_HR_Negative patients, and compare leukocyte esterase/pathogen score with the all positive group
# First for LE

pca_df_4f$LE_discrete <- reads.meta.counts[-(which(pca_df$Sample == "20070")) , ]$enrleuks

pca_df_4f_filtered <- pca_df_4f[pca_df_4f$group %in% c("All_Positive", "MT_HR_Negative"), ]
comp_1 <- ggplot(pca_df_4f_filtered, aes(x = factor(group), y = LE_discrete, fill = factor(group))) +
  geom_violin(trim = FALSE, show.legend = FALSE) +  # Violin plot with no trim and without legend
  geom_boxplot(width = 0.1, color = "black", fill = "white", alpha = 0.6, outlier.shape = NA) +  # Boxplot inside violins
  labs(x = "Discrepancy Group", y = "Absolute Leukocyte Esterase Activity", title = "Distribution of Factor Across All_Positive and MT_HR_Negative") +
  scale_fill_manual(values = c("All_Positive" = "#1f77b4", "MT_HR_Negative" = "#ff7f0e")) +  # Custom colors for the groups
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.border = element_rect(color = "black", fill = NA, size = 1))  # Add a border around the plot

wilcox.test(LE_discrete ~ group, data = pca_df_4f_filtered)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  LE_discrete by group
## W = 460, p-value = 0.004962
## alternative hypothesis: true location shift is not equal to 0
print(comp_1)

# Next for pathogen abundance

pca_df_4f$rpm_sums <- rpm_sums[names(rpm_sums) != "20070"]
pca_df_4f_filtered <- pca_df_4f[pca_df_4f$group %in% c("All_Positive", "MT_HR_Negative"), ]

comp_2 <- ggplot(pca_df_4f_filtered, aes(x = factor(group), y = 10^rpm_sums, fill = factor(group))) +
  geom_violin(trim = FALSE, show.legend = FALSE) +  # Violin plot with no trim and without legend
  geom_boxplot(width = 0.1, color = "black", fill = "white", alpha = 0.6, outlier.shape = NA) +  # Boxplot inside violins
  labs(x = "Discrepancy Group", y = "Pathogen RPM", title = "Distribution of Factor Across All_Positive and MT_HR_Negative") +
  scale_fill_manual(values = c("All_Positive" = "purple", "MT_HR_Negative" = "darkgreen")) +  # Custom colors for the groups
  theme_minimal() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.border = element_rect(color = "black", fill = NA, size = 1))  # Add a border around the plot

wilcox.test(10^rpm_sums ~ group, data = pca_df_4f_filtered)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  10^rpm_sums by group
## W = 408, p-value = 0.07842
## alternative hypothesis: true location shift is not equal to 0
print(comp_2)

  1. Cell type enrichment with xCell, mapping on all four prediction types
xCell_txi <- txi.data$abundance
rownames(xCell_txi) <- res.df.human$symbol


xcell_results <- xCellAnalysis(xCell_txi)
## [1] "Num. of genes: 10358"
xcell_results_scaled <- xcell_results[ , colnames(xcell_results) != "20070"]
xcell_results_scaled <- t(scale(t(xcell_results_scaled)))

# Ensure the order matches between pred.df and xcell_results_scaled
binary_annots <- pred.df[, 1:4]
binary_annots <- binary_annots[colnames(xcell_results_scaled), ]  # reordering to match

# Step 1: Get ordering based on the annotation column
ordering <- order(pred.df$`RNA-Seq Host Response Cluster`, decreasing = TRUE)

# Step 2: Reorder both the data and annotations
xcell_results_scaled_ordered <- xcell_results_scaled[, ordering]
binary_annots_ordered <- pred.df[ordering, 1:4]

# Step 3: Rebuild the annotation with reordered rows
ha <- HeatmapAnnotation(
  df = binary_annots_ordered,
  col = list(
    `RNA-Seq Host Response Cluster` = c("0" = "grey", "1" = "red"),
    `RNA-Seq Pathogen Abundance` = c("0" = "grey", "1" = "blue"),
    `Clinical Host Response Score` = c("0" = "grey", "1" = "darkgreen"),
    `Clinical Pathogen Abundance` = c("0" = "grey", "1" = "purple")
  ),
  show_annotation_name = TRUE,
  annotation_name_side = "left"
)

# Step 4: Redraw the heatmap with sorted columns
xcell_heatmap <- Heatmap(
  xcell_results_scaled_ordered,
  top_annotation = ha,
  col = colorRamp2(c(-2, 0, 2), c("blue", "white", "red")),
  cluster_columns = FALSE,
  column_names_gp = gpar(fontsize = 0), 
  row_names_gp = gpar(fontsize = 8),   
  name = "xCell score"                
)

# Draw the heatmap
print(xcell_heatmap)

  1. Compute AUC for all cell types across all groups
RNAseq_HR <- pred.df[ , 1]
RNAseq_tax <- pred.df[ , 2]
clin_HR <- pred.df[ , 3]
clin_tax <- pred.df[ , 4]

cell_enrichment_auc <- function(binary) { 
  
  # Prepare storage
  auc_results <- numeric(nrow(xcell_results_scaled))
  names(auc_results) <- rownames(xcell_results_scaled)
  
  # Loop through each cell type
  for (i in seq_len(nrow(xcell_results_scaled))) {
    predictor <- as.numeric(xcell_results_scaled[i, ])  # expression of one cell type across patients
    response <- binary
  
    # Logistic regression model
    model <- glm(response ~ predictor, family = binomial)
    predicted_probs <- predict(model, type = "response")
  
    # Compute AUC
    auc_results[i] <- auc(response, predicted_probs)
  }
  
  return(auc_results)
  
}

RNAseq_HR_auc <- cell_enrichment_auc(RNAseq_HR)
RNAseq_tax_auc <- cell_enrichment_auc(RNAseq_tax)
clin_HR_auc <- cell_enrichment_auc(clin_HR)
clin_tax_auc <- cell_enrichment_auc(clin_tax)

cell_enrichment_auc_df <- data.frame(RNAseq_HR_auc, 
                                        RNAseq_tax_auc, 
                                        clin_HR_auc, 
                                        clin_tax_auc)
colnames(cell_enrichment_auc_df) <- colnames(pred.df[ , c(1, 2, 3, 4)])